

### -------------------------------------------------- 
### ---- General Set Up ---- 
### -------------------------------------------------- 

### Clear global environment
rm(list=ls()) 


### Libraries:
library(pacman)
pacman::p_unload(all)
pacman::p_load(tidyverse, data.table, lazyeval, magrittr, lubridate, readstata13,
       textclean, here)


### File Location (R project only)
here::i_am("Survey and App Match/107_generate_state_flags.R")
file_path <- here()
file_path_data_matched <- here("Survey and App Match/Temp_Data/")


### Run file with functions for downloading VF files
source(paste0(file_path,"/Functions/functions_flag_states.R"))


### -------------------------------------------------- 
### ---- Download TFA Data Matched to State ----
### -------------------------------------------------- 

### Load tfa_dat_match from file #6
load(paste0(file_path_data_matched,"/tfa_to_state_ALL_v2.RData")) 


### -------------------------------------------------- 
### ---- Flag Obs Matched to State ----
### -------------------------------------------------- 

### -------------------------------------------------- 
### Create flags (dummy vars) for states
### -------------------------------------------------- 

### Create new copy of the tfa data
tfa_dat_flag <- tfa_dat_match

### Input
# List of state abbreviations 
data(state) #Download list of state abbreviations
state_abb_w_DC <- c("DC",state.abb) #Add DC to list of state abbreviations


# List variable names with state info (used in loop)
var_names <- c("state_ca", "state_uni", "state_st", 
               "state_mr", "state_cell")

### Run Loop 
for(s in 1:51){ #Run loop for all 50 states plus DC
  
  choose_ABB <- state_abb_w_DC[s] #Assign variable with one of 51 abbreviations
  
  for(v in 1:length(var_names)){ #Run for each of the 7 variables in var_names above
    # Function creates indicator variable for whether variable v equals a state s
    tfa_dat_flag <- fun_state_variable_flag(dat = tfa_dat_flag,
                                            ABB = choose_ABB,
                                            choose_var = var_names[v])
  }
  # Function creates (1) sum of total vars in var_names above that equal state s 
  # (2) indicator variable for whether any variable v equals a state s
  tfa_dat_flag <- fun_state_flag(dat = tfa_dat_flag,
                                 ABB = choose_ABB)
}


### -------------------------------------------------- 
### ---- Clean Vars to match VF ----
### -------------------------------------------------- 

temp <- tfa_dat_flag

### -------------------------------------------------- 
### ***Names ----
### -------------------------------------------------- 

### Clean names
tfa_dat_flag %<>% 
  mutate(FirstName = toupper(str_replace_all(firstname,"[^[:graph:][:space:]]", ""))) %>%
  mutate(LastName = toupper(str_replace_all(lastname, "[^[:graph:][:space:]]", "")))


### -------------------------------------------------- 
### ***Gender ----
### -------------------------------------------------- 

### Check gender vars
tfa_dat <- tfa_dat_flag

### Recode Sex/gender to match vf
tfa_dat_flag %<>%
  mutate(Sex = NA) %>%
  mutate(Sex = ifelse(gender=="MALE","M",Sex) ) %>%
  mutate(Sex = ifelse(gender=="FEMALE","F",Sex) ) %>%
  mutate(Sex = case_when(gender==""&dem3==1~"M",
                         gender==""&dem3==2~"F",
                         T~Sex))


### -------------------------------------------------- 
### ***Dates (DoB and Graduation) ----
### -------------------------------------------------- 

### Clean graduation date
tfa_dat_flag %<>%
  # Reformat degree date (assume in 2000s if < 20)
  mutate(new_degreedate = as.Date(degreedate, "%d-%b-%y") ) %>%
  mutate(DegYear = strftime(new_degreedate, "19%y")) %>%
  mutate(DegYear = ifelse(as.numeric(DegYear)<1920,strftime(new_degreedate, "20%y"),DegYear)) %>%
  mutate(DegMonth = strftime(new_degreedate, "%m")) %>%
  mutate(DegDay = strftime(new_degreedate, "%d")) %>%
  # Verify that vars are saved as character vars
  mutate_at(vars(DegYear, DegMonth, DegDay), as.character)

### Clean birthdays
tfa_dat_flag %<>%
  # Reformat birth date (assume in 2000s if < 20)
  mutate(new_dateofbirth = as.Date(dateofbirth, "%d-%b-%y") ) %>%
  mutate(BirthYear = strftime(new_dateofbirth, "19%y")) %>%
  mutate(BirthYear = ifelse(as.numeric(BirthYear)<1920,strftime(new_dateofbirth, "20%y"),BirthYear)) %>%
  mutate(BirthMonth = strftime(new_dateofbirth, "%m")) %>%
  mutate(BirthDay = strftime(new_dateofbirth, "%d")) %>%
  # Save composit
  mutate(BirthDate_full = ifelse(!is.na(BirthMonth)&!is.na(BirthDay)&!is.na(BirthYear),
                                 as.character(paste0(BirthMonth,BirthDay,BirthYear)),NA)) %>%
  # Drop if Birth Date is before 1938
  filter(as.numeric(BirthYear)>1937 | is.na(BirthYear)) %>%
  # Replace BirthYear with NA if diff btw DegreeYear & BirthYear < 13
  mutate(BirthYear = ifelse((as.numeric(DegYear)-as.numeric(BirthYear))<13,NA,BirthYear)) %>%
  # Verify that Birth vars are saved as character vars
  mutate_at(vars(BirthYear, BirthMonth, BirthDay), as.character) %>%
  rename(birthyear_app = BirthYear,
         birthmonth_app = BirthMonth,
         birthday_app = BirthDay)

### Rename survey responses for birth year
tfa_dat_flag %<>% mutate(dob_survey=pre3_0)

### Find most common difference between DegYear and DOB from survey (=22)
tfa_dat_flag %>% 
  mutate_at(vars(DegYear, dob_survey), ~as.numeric(as.character(.))) %>% 
  mutate(diff = DegYear-dob_survey) %>% 
  group_by(diff) %>% summarise(n=n()) %>% arrange(desc(n))

tfa_dat_flag %>% 
  mutate_at(vars(DegYear, dob_survey), ~as.numeric(as.character(.))) %>% 
  mutate(diff = DegYear-dob_survey) %>%
  ggplot(aes(x=diff)) + geom_histogram(binwidth = 1) + theme_minimal()

### Use DegYear-22 as a proxy for birthyear
tfa_dat_flag %<>% 
  mutate(BirthYear = ifelse(appyear %in% c(2007:2009, 2014, 2015), 
                            as.numeric(birthyear_app), 
                            as.numeric(DegYear)-22)) %>%
  mutate(BirthYear=as.character(BirthYear))


### -------------------------------------------------- 
### ***Phone Numbers ----
### -------------------------------------------------- 

### Clean phone numbers
tfa_dat_flag %<>%
  mutate(homephone_clean = gsub(" ", "", homephone)) %>%
  mutate(workphone_clean = gsub(" ", "", workphone)) %>%
  mutate(cellphone_clean = gsub(" ", "", cellphone)) %>%
  mutate(homephone_clean = textclean::strip(homephone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(workphone_clean = textclean::strip(workphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(cellphone_clean = textclean::strip(cellphone_clean, digit.remove=FALSE, char.keep=NULL)) %>%
  mutate(homephone_area_code = str_sub(homephone_clean, start=1, end=3)) %>%
  mutate(workphone_area_code = str_sub(workphone_clean, start=1, end=3)) %>%
  mutate(cellphone_area_code = str_sub(cellphone_clean, start=1, end=3)) %>%
  mutate(homephone_number = str_sub(homephone_clean, start=4, end=10)) %>%
  mutate(workphone_number = str_sub(workphone_clean, start=4, end=10)) %>%
  mutate(cellphone_number = str_sub(cellphone_clean, start=4, end=10)) %>%
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)>10,NA,homephone_area_code)) %>%
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                      str_sub(homephone_clean, start=2, end=4),homephone_area_code)) %>% 
  mutate(homephone_area_code = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                      str_sub(homephone_clean, start=3, end=5),homephone_area_code)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)>10,NA,homephone_number)) %>%
  mutate(homephone_number = ifelse(nchar(homephone_clean)==11 & str_sub(homephone_clean, start=1, end=1) == "1",
                                   str_sub(homephone_clean, start=5, end=11),homephone_number)) %>% 
  mutate(homephone_number = ifelse(nchar(homephone_clean)==12 & str_sub(homephone_clean, start=1, end=2) == "01",
                                   str_sub(homephone_clean, start=6, end=12),homephone_number)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)>10,NA,workphone_area_code)) %>%
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                      str_sub(workphone_clean, start=2, end=4),workphone_area_code)) %>% 
  mutate(workphone_area_code = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                      str_sub(workphone_clean, start=3, end=5),workphone_area_code)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)>10,NA,workphone_number)) %>%
  mutate(workphone_number = ifelse(nchar(workphone_clean)==11 & str_sub(workphone_clean, start=1, end=1) == "1",
                                   str_sub(workphone_clean, start=5, end=11),workphone_number)) %>% 
  mutate(workphone_number = ifelse(nchar(workphone_clean)==12 & str_sub(workphone_clean, start=1, end=2) == "01",
                                   str_sub(workphone_clean, start=6, end=12),workphone_number)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)>10,NA,cellphone_area_code)) %>%
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                      str_sub(cellphone_clean, start=2, end=4),cellphone_area_code)) %>% 
  mutate(cellphone_area_code = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                      str_sub(cellphone_clean, start=3, end=5),cellphone_area_code)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)>10,NA,cellphone_number)) %>%
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==11 & str_sub(cellphone_clean, start=1, end=1) == "1",
                                   str_sub(cellphone_clean, start=5, end=11),cellphone_number)) %>% 
  mutate(cellphone_number = ifelse(nchar(cellphone_clean)==12 & str_sub(cellphone_clean, start=1, end=2) == "01",
                                   str_sub(cellphone_clean, start=6, end=12),cellphone_number))

### Standardize TFA Zipcodes
  
# Fix Unicode Errors
tfa_dat_flag[8749,c("personid","Current_Address.Zip")] #personid=1728746
tfa_dat_flag %<>% mutate(Current_Address.Zip=ifelse(personid=="1728746", "21224", Current_Address.Zip))
tfa_dat_flag[13653,c("personid","Current_Address.Zip")] #personid=1871883
tfa_dat_flag %<>% mutate(Current_Address.Zip=ifelse(personid=="1871883", "94115", Current_Address.Zip))
tfa_dat_flag[39876,c("personid","Current_Address.Zip")] #personid=2577590
tfa_dat_flag %<>% mutate(Current_Address.Zip=ifelse(personid=="2577590", NA, Current_Address.Zip))

# Clean
tfa_dat_flag %<>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="-",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="-",NA,Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="N/A",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="N/A",NA,Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip_old = Current_Address.Zip,
         Permanent_Address.Zip_old = Permanent_Address.Zip) %>%
  mutate(Current_Address.Zip = ifelse(nchar(trimws(Current_Address.Zip))==10, 
                                      str_sub(Current_Address.Zip, start=1, end=5), 
                                              Current_Address.Zip)) %>%
  mutate(Permanent_Address.Zip = ifelse(nchar(trimws(Permanent_Address.Zip))==10, 
                                        str_sub(Permanent_Address.Zip, start=1, end=5), 
                                                Permanent_Address.Zip)) %>%
  mutate(Current_Address.Zip = ifelse(nchar(trimws(Current_Address.Zip))<5, 
                                      formatC(as.numeric(Current_Address.Zip), width=5, flag="0"), 
                                      Current_Address.Zip)) %>%
  mutate(Permanent_Address.Zip = ifelse(nchar(trimws(Permanent_Address.Zip))<5, 
                                        formatC(as.numeric(Permanent_Address.Zip), width=5, flag="0"),  
                                                Permanent_Address.Zip)) 

### Standardize TFA Dataset
tfa_dat_flag %<>%
  mutate(LandLine_AreaCode = ifelse(trimws(homephone_area_code)=="",NA,homephone_area_code)) %>% 
  mutate(LandLine_Number = ifelse(trimws(homephone_number)=="",NA,homephone_number)) %>%
  mutate(CellPhone_AreaCode = ifelse(trimws(cellphone_area_code)=="",NA,cellphone_area_code)) %>%
  mutate(CellPhone_Number = ifelse(trimws(cellphone_number)=="",NA,cellphone_number)) %>%
  mutate(Current_Address.Zip = ifelse(trimws(Current_Address.Zip)=="",NA,Current_Address.Zip)) %>% 
  mutate(Permanent_Address.Zip = ifelse(trimws(Permanent_Address.Zip)=="",NA,Permanent_Address.Zip)) 


### -------------------------------------------------- 
### ---- CHUNKING VARS ----
### -------------------------------------------------- 

tfa_dat_flag %<>%
  mutate(first_letter = str_to_upper(str_sub(LastName, start=1, end=1)))


### -------------------------------------------------- 
### ---- Save Dataset ----
### -------------------------------------------------- 

### Save tfa_dat_flag
save(tfa_dat_flag, 
     file=paste0(file_path_data_matched,"/tfa_to_state_ALL_flags_v2.RData")) #changed name 

### Save stata version for final analysis 
tfa_dat_flag_stata <- tfa_dat_flag %>%
  select(-starts_with('state_st_')) %>%
  select(-matches('^state_.*_total$')) %>%
  select(-matches('^state_.*_flag$')) 

tfa_dat_flag_stata$new_dateofbirth %>% is.na %>% sum

temp <- tfa_dat_flag
names(temp) <- gsub("\\.", "_", names(temp))
names(temp)[names(temp) == 'Calculated_Prospect_Type_cleaned'] <- 'Calculated_Prospect_Type_clean'
haven::write_dta(temp, paste0(file_path_data_matched,"/tfa_to_state_ALL_flags_v2.dta"), label = NULL)

